In [1]:
import json
import ast
def get_node_dictionary(path='/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/Posts_FullInfoSmall.xml'):
    """Count AST node types over the parsable Python snippets in a posts dump.

    The file is read one line at a time and each line is decoded with
    ``json.loads``; lines that fail to decode are skipped. For every entry in
    the decoded object's 'CodeBlocks' list, the snippet is counted only when
    its 'Guesslang' value is 'python' (ignoring case and surrounding
    whitespace), its 'Parsable' value is the string 'True', and
    ``ast.parse`` accepts its 'code' text.

    Progress (lines read, then snippets parsed) is printed every 10000 lines.

    Args:
        path: File to scan. Defaults to the dump this notebook was written
            against, so existing zero-argument calls keep working.

    Returns:
        dict mapping 'ast.<NodeClass>' (for example 'ast.Name') to the number
        of nodes of that class found by ``ast.walk`` across all snippets.
    """
    count = 0
    node_counter = {}
    total = 0
    # The context manager guarantees the file handle is released even if the
    # scan is interrupted part-way through.
    with open(path) as infile:
        for line in infile:
            count += 1
            try:
                line_obj = json.loads(line)
            except Exception:
                # Best effort: skip undecodable lines, but (unlike a bare
                # `except:`) let KeyboardInterrupt/SystemExit propagate.
                continue
            for code_block in line_obj['CodeBlocks']:
                if code_block['Guesslang'].lower().strip() == 'python' and code_block['Parsable'] == 'True':
                    code = code_block['code']
                    try:
                        tree = ast.parse(code)
                    except Exception:
                        # Snippet is not valid for this interpreter; skip it.
                        continue
                    total += 1
                    for node in ast.walk(tree):
                        # Build the key from the class name instead of slicing
                        # str(node): the repr of AST nodes is not a stable
                        # format across Python versions, whereas the class
                        # name is. The 'ast.' prefix keeps the key layout that
                        # downstream cells expect.
                        node_name = 'ast.' + type(node).__name__
                        node_counter[node_name] = node_counter.get(node_name, 0) + 1
            if count % 10000 == 0:
                print(count)
                print('\t',total)
    return node_counter
In [2]:
# Scan the input file and collect the per-node-type counts; `nc` is the raw
# dictionary that the following cells clean up and normalise.
nc = get_node_dictionary()
In [3]:
# Inspect which node-name keys were collected.
nc.keys()
Out[3]:
In [6]:
# Re-key the counts by bare node class name, e.g. 'ast.Name' -> 'Name'.
# Splitting on the last '.' instead of slicing off a fixed four characters
# keeps this correct when the prefix has a different length; a key with no
# '.' at all is kept unchanged.
new_nc = {}
for key in nc:
    new_nc[key.rpartition('.')[2]] = nc[key]
In [9]:
# Convert the raw counts in `new_nc` into fractions of the overall node count.
# First pass: accumulate the grand total of all counts.
total = 0
for node_count in new_nc.values():
    total += node_count
# Second pass: divide every entry by that total, updating the dict in place.
for el in new_nc:
    new_nc[el] /= total
In [24]:
# Print, per category, the share of each node type followed by the category
# total, then a final 'Misc' section for node types no category mentions.
# NOTE: this cell reads `categories` and `new_nc`; `categories` must already
# be defined in the kernel before this cell is run.
print(' Node name Percentage of total nodes')
cat_set = set()  # every node name that some category lists
for heading, members in categories:
    print(heading)
    total_num = 0
    for el in members:
        cat_set.add(el)
        if el in new_nc:
            total_num += new_nc[el]
            shown = "%.3f" % new_nc[el]
        else:
            # Node type never observed: print a plain 0 and leave the total alone.
            shown = 0
        # Pad the name to 15 characters so the value column lines up.
        print('\t',el.ljust(15),':\t',shown)
        print('\t' + '-'*30)
    print('\tTOTAL: ',"%.3f" % total_num)
    print('*'*50)

print('Misc')
total_num = 0
for el in new_nc.keys():
    if el in cat_set:
        continue
    print('\t',el.ljust(15),':\t',"%.3f" % new_nc[el])
    print('\t' + '-'*30)
    total_num += new_nc[el]
print('\tTOTAL: ',"%.3f" % total_num)
In [4]:
# Report layout: an ordered list of (heading, [node names]) pairs. Each pair
# is one section of the printed report, and the names are looked up as keys
# in the normalised node-count dictionary.
categories = [
    ('Literals', ['Num', 'Str', 'FormattedValue', 'JoinedStr']),
    ('Variables', ['Name', 'NameConstant', 'Starred']),
    ('UnaryOps', ['UnaryOp', 'UAdd', 'USub', 'Not', 'Invert']),
    ('Math', ['Add', 'Sub', 'Mult', 'Div', 'FloorDiv', 'Mod', 'Pow']),
    ('Binary Ops', ['LShift', 'RShift', 'BitOr', 'BitXor', 'BitAnd']),
    ('BoolOp', ['BoolOp', 'And', 'Or']),
    (
        'Compare',
        [
            'Compare', 'Eq', 'NotEq', 'Lt', 'LtE', 'Gt', 'GtE',
            'Is', 'IsNot', 'In', 'NotIn',
        ],
    ),
    ('Subscripting', ['Subscript', 'Index', 'Slice', 'ExtSlice']),
]
In [ ]: